libraries

#install.packages("SNPfiltR");data(vcfR.example)
library(SNPfiltR)
## This is SNPfiltR v.1.0.0
## 
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/ 
## 
## If you use SNPfiltR in your published work, please cite the following papers: 
## 
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618 
## 
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
library(tinytex)
library(vcfR)
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.13.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
library(ggplot2)

Reading in the data

I am only showing code for the full dataset. Please see table S2 for the specific stats to produce the subsetted datasets.

This code is nearly verbatim from Devon DeRaad’s tutorial for SNPFiltr. Please look at it–there’s much more information there https://github.com/DevonDeRaad/SNPfiltR

At the bottom of this file is instructions to get a whitelist of loci for phylogenetic analyses

# set your working directory
# setwd("~/Dropbox/Publications/2022-McCullough-sacer-gbs/sacer-GBS/analyses/03_final-dataset")
# I put the vcf file from #2 in a folder called "CARC_output"

vcfR <- read.vcfR("CARC-output/sacer63_50per.vcf")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 15057
##   header_line: 15058
##   variant count: 60875
##   column count: 72
## 
Meta line 1000 read in.
Meta line 2000 read in.
Meta line 3000 read in.
Meta line 4000 read in.
Meta line 5000 read in.
Meta line 6000 read in.
Meta line 7000 read in.
Meta line 8000 read in.
Meta line 9000 read in.
Meta line 10000 read in.
Meta line 11000 read in.
Meta line 12000 read in.
Meta line 13000 read in.
Meta line 14000 read in.
Meta line 15000 read in.
Meta line 15057 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 60875
##   Character matrix gt cols: 72
##   skip: 0
##   nrows: 60875
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant: 60875
## All variants processed
### check the metadata present in your vcf
vcfR
## ***** Object of Class vcfR *****
## 63 samples
## 168 CHROMs
## 60,875 variants
## Object size: 281 Mb
## 11.18 percent missing data
## *****        *****         *****
### read in popmap that I used from STACKS, note that I turned it from a tab separated headerless file 
# to a comma separated file with headers "id" and "pop" and put it in the "CARC-output" folder
popmap<-read.csv("CARC-output/sacer63.csv")

Filtering Sacer 63

# Sacer 63 has all the samples. 

# So let's play with filtering!
hard_filter(vcfR=vcfR)
## no depth cutoff provided, exploratory visualization will be generated.

## no genotype quality cutoff provided, exploratory visualization will be generated.

## ***** Object of Class vcfR *****
## 63 samples
## 168 CHROMs
## 60,875 variants
## Object size: 281 Mb
## 11.18 percent missing data
## *****        *****         *****
#hard filter to minimum depth of 5, and minimum genotype quality of 30
vcfR<-hard_filter(vcfR=vcfR, depth = 5, gq = 30)
## 10.9% of genotypes fall below a read depth of 5 and were converted to NA
## 0.76% of genotypes fall below a genotype quality of 30 and were converted to NA

Allele balance

#From the SNP filtering tutorial:
# “Allele balance: a number between 0 and 1 representing the ratio of reads showing the 
# reference allele to all reads, considering only reads from individuals called as 
# heterozygous, we expect that the allele balance in our data (for real loci) should
#  be close to 0.5”

#the SNPfiltR allele balance function will convert heterozygous genotypes to 
# missing if they fall outside of the .25-.75 range.
#execute allele balance filter
vcfR<-filter_allele_balance(vcfR)
## 9.29% of het genotypes (0.28% of all genotypes) fall outside of 0.25 - 0.75 allele balance ratio and were converted to NA

Removing low quality samples

#visualize and pick appropriate max depth cutoff
# Now we can execute a max depth filter (super high depth loci are likely multiple loci 
# stuck together into a single paralogous locus).This filter is applied ‘per SNP’ rather 
# than ‘per genotype’ otherwise we would simply be removing most of the genotypes from 
# our deepest sequenced samples (because sequencing depth is so variable between samples).
# By filtering per SNP, we remove the SNPs with outlier depth values, which are most 
#likely to be spuriously mapped/built paralagous loci.

max_depth(vcfR)
## cutoff is not specified, exploratory visualization will be generated.

## dashed line indicates a mean depth across all SNPs of 132.5

# filter vcf by the max depth cutoff you chose
vcfR<-max_depth(vcfR, maxdepth = 157)
## maxdepth cutoff is specified, filtered vcfR object will be returned
## 34.89% of SNPs were above a mean depth of 157 and were removed from the vcf

#check vcfR to see how many SNPs we have left
vcfR
## ***** Object of Class vcfR *****
## 63 samples
## 149 CHROMs
## 39,634 variants
## Object size: 142.8 Mb
## 30.98 percent missing data
## *****        *****         *****
#Set arbitrary cutoff for missing data allowed per sample. 
#We will start by determining which samples contain too few sequences to be used
# in downstream analyses, by visualizing missing data per sample
#run function to visualize samples
missing_by_sample(vcfR=vcfR, popmap = popmap)
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

##         indiv filt snps.retained
## 1   viti24248  0.5         18346
## 2   viti24247  0.5         24136
## 3   viti30489  0.5         22516
## 4   viti30504  0.5         26385
## 5   viti30468  0.5         24722
## 6   viti30467  0.5         25405
## 7   viti30470  0.5         27579
## 8   viti30490  0.5         24867
## 9   mari26392  0.5         22844
## 10  mari26383  0.5         20448
## 11  mari26394  0.5         24870
## 12  mari26391  0.5         26971
## 13  mari26382  0.5         26397
## 14  mari26405  0.5         21400
## 15  mari26408  0.5         21131
## 16  mari26406  0.5         25516
## 17  mari26342  0.5         19941
## 18  mari26347  0.5         23528
## 19  mari26338  0.5         26721
## 20  mari26348  0.5         24185
## 21  mari26439  0.5         25171
## 22  exim24403  0.5         18788
## 23  exim25219  0.5         18678
## 24  exim25227  0.5         19135
## 25  exim25206  0.5         27770
## 26  peal89771  0.5         27649
## 27  juli21453  0.5         20373
## 28  juli21486  0.5         20732
## 29  juli21504  0.5         27527
## 30  sant21522  0.5         22163
## 31  sant21429  0.5         25186
## 32  sant21519  0.5         23213
## 33  sant45831  0.5         20048
## 34  sant21626  0.5         26090
## 35  orna19404  0.5         22374
## 36  solo12834  0.5         19835
## 37  solo35056  0.5         20340
## 38  solo15921  0.5         26604
## 39  solo35081  0.5         27890
## 40  solo35037  0.5         27488
## 41  amoe35602  0.5         19752
## 42  amoe35570  0.5         24301
## 43  amoe35585  0.5         27353
## 44  amoe35579  0.5         27744
## 45  amoe35565  0.5         27984
## 46  pele23651  0.5         24282
## 47  pele23662  0.5         19165
## 48  farq21480  0.5         26528
## 49  farq21509  0.5         24956
## 50  farq21584  0.5         26397
## 51  farq21586  0.5         21251
## 52  farq21587  0.5         24746
## 53  lpyg32019  0.5         19026
## 54  lpyg34505  0.5         22530
## 55   lpyg6654  0.5         19824
## 56   cin16774  0.5         20229
## 57   cin22024  0.5         25418
## 58  cinB25306  0.5         23344
## 59  rufi42805  0.5         20670
## 60  mauk42604  0.5         20940
## 61  mauk42603  0.5         23425
## 62  atiu42504  0.5         24491
## 63   gertO343  0.5         25728
## 64  viti24248  0.6         17500
## 65  viti24247  0.6         22308
## 66  viti30489  0.6         20974
## 67  viti30504  0.6         23809
## 68  viti30468  0.6         22728
## 69  viti30467  0.6         23228
## 70  viti30470  0.6         24367
## 71  viti30490  0.6         22757
## 72  mari26392  0.6         21410
## 73  mari26383  0.6         19189
## 74  mari26394  0.6         22666
## 75  mari26391  0.6         24099
## 76  mari26382  0.6         23606
## 77  mari26405  0.6         20100
## 78  mari26408  0.6         19714
## 79  mari26406  0.6         23290
## 80  mari26342  0.6         18787
## 81  mari26347  0.6         21888
## 82  mari26338  0.6         23911
## 83  mari26348  0.6         22194
## 84  mari26439  0.6         22878
## 85  exim24403  0.6         18022
## 86  exim25219  0.6         17608
## 87  exim25227  0.6         18051
## 88  exim25206  0.6         24465
## 89  peal89771  0.6         24398
## 90  juli21453  0.6         19380
## 91  juli21486  0.6         19625
## 92  juli21504  0.6         24403
## 93  sant21522  0.6         20872
## 94  sant21429  0.6         23080
## 95  sant21519  0.6         21702
## 96  sant45831  0.6         18853
## 97  sant21626  0.6         23633
## 98  orna19404  0.6         20939
## 99  solo12834  0.6         18354
## 100 solo35056  0.6         19426
## 101 solo15921  0.6         23539
## 102 solo35081  0.6         24545
## 103 solo35037  0.6         24293
## 104 amoe35602  0.6         18893
## 105 amoe35570  0.6         22505
## 106 amoe35585  0.6         24333
## 107 amoe35579  0.6         24466
## 108 amoe35565  0.6         24621
## 109 pele23651  0.6         22071
## 110 pele23662  0.6         18203
## 111 farq21480  0.6         23382
## 112 farq21509  0.6         22558
## 113 farq21584  0.6         23302
## 114 farq21586  0.6         19957
## 115 farq21587  0.6         22328
## 116 lpyg32019  0.6         17681
## 117 lpyg34505  0.6         19987
## 118  lpyg6654  0.6         17833
## 119  cin16774  0.6         19076
## 120  cin22024  0.6         22776
## 121 cinB25306  0.6         21523
## 122 rufi42805  0.6         19549
## 123 mauk42604  0.6         19809
## 124 mauk42603  0.6         21595
## 125 atiu42504  0.6         22346
## 126  gertO343  0.6         23126
## 127 viti24248  0.7         16191
## 128 viti24247  0.7         19613
## 129 viti30489  0.7         18618
## 130 viti30504  0.7         20385
## 131 viti30468  0.7         19817
## 132 viti30467  0.7         20176
## 133 viti30470  0.7         20515
## 134 viti30490  0.7         19924
## 135 mari26392  0.7         19207
## 136 mari26383  0.7         17345
## 137 mari26394  0.7         19780
## 138 mari26391  0.7         20406
## 139 mari26382  0.7         20247
## 140 mari26405  0.7         18086
## 141 mari26408  0.7         17853
## 142 mari26406  0.7         20171
## 143 mari26342  0.7         17029
## 144 mari26347  0.7         19435
## 145 mari26338  0.7         20419
## 146 mari26348  0.7         19467
## 147 mari26439  0.7         19831
## 148 exim24403  0.7         16871
## 149 exim25219  0.7         16023
## 150 exim25227  0.7         16430
## 151 exim25206  0.7         20569
## 152 peal89771  0.7         20507
## 153 juli21453  0.7         17775
## 154 juli21486  0.7         17971
## 155 juli21504  0.7         20587
## 156 sant21522  0.7         18792
## 157 sant21429  0.7         20026
## 158 sant21519  0.7         19268
## 159 sant45831  0.7         16946
## 160 sant21626  0.7         20244
## 161 orna19404  0.7         18772
## 162 solo12834  0.7         16418
## 163 solo35056  0.7         17904
## 164 solo15921  0.7         20011
## 165 solo35081  0.7         20622
## 166 solo35037  0.7         20501
## 167 amoe35602  0.7         17423
## 168 amoe35570  0.7         19784
## 169 amoe35585  0.7         20562
## 170 amoe35579  0.7         20584
## 171 amoe35565  0.7         20655
## 172 pele23651  0.7         19158
## 173 pele23662  0.7         16720
## 174 farq21480  0.7         19724
## 175 farq21509  0.7         19426
## 176 farq21584  0.7         19707
## 177 farq21586  0.7         18061
## 178 farq21587  0.7         19269
## 179 lpyg32019  0.7         15693
## 180 lpyg34505  0.7         17011
## 181  lpyg6654  0.7         15304
## 182  cin16774  0.7         17286
## 183  cin22024  0.7         19401
## 184 cinB25306  0.7         18956
## 185 rufi42805  0.7         17779
## 186 mauk42604  0.7         17885
## 187 mauk42603  0.7         18992
## 188 atiu42504  0.7         19418
## 189  gertO343  0.7         19718
## 190 viti24248  0.8         14570
## 191 viti24247  0.8         16791
## 192 viti30489  0.8         16161
## 193 viti30504  0.8         17090
## 194 viti30468  0.8         16912
## 195 viti30467  0.8         17106
## 196 viti30470  0.8         17142
## 197 viti30490  0.8         16996
## 198 mari26392  0.8         16686
## 199 mari26383  0.8         15292
## 200 mari26394  0.8         16913
## 201 mari26391  0.8         17068
## 202 mari26382  0.8         17064
## 203 mari26405  0.8         15741
## 204 mari26408  0.8         15750
## 205 mari26406  0.8         17052
## 206 mari26342  0.8         15107
## 207 mari26347  0.8         16736
## 208 mari26338  0.8         17089
## 209 mari26348  0.8         16676
## 210 mari26439  0.8         16868
## 211 exim24403  0.8         15286
## 212 exim25219  0.8         14293
## 213 exim25227  0.8         14611
## 214 exim25206  0.8         17166
## 215 peal89771  0.8         17097
## 216 juli21453  0.8         15790
## 217 juli21486  0.8         15999
## 218 juli21504  0.8         17169
## 219 sant21522  0.8         16384
## 220 sant21429  0.8         16999
## 221 sant21519  0.8         16693
## 222 sant45831  0.8         14892
## 223 sant21626  0.8         17017
## 224 orna19404  0.8         16289
## 225 solo12834  0.8         14438
## 226 solo35056  0.8         15917
## 227 solo15921  0.8         16929
## 228 solo35081  0.8         17196
## 229 solo35037  0.8         17151
## 230 amoe35602  0.8         15613
## 231 amoe35570  0.8         16928
## 232 amoe35585  0.8         17162
## 233 amoe35579  0.8         17167
## 234 amoe35565  0.8         17200
## 235 pele23651  0.8         16315
## 236 pele23662  0.8         14888
## 237 farq21480  0.8         16492
## 238 farq21509  0.8         16427
## 239 farq21584  0.8         16506
## 240 farq21586  0.8         15825
## 241 farq21587  0.8         16344
## 242 lpyg32019  0.8         13737
## 243 lpyg34505  0.8         14563
## 244  lpyg6654  0.8         13172
## 245  cin16774  0.8         15227
## 246  cin22024  0.8         16371
## 247 cinB25306  0.8         16332
## 248 rufi42805  0.8         15659
## 249 mauk42604  0.8         15801
## 250 mauk42603  0.8         16327
## 251 atiu42504  0.8         16476
## 252  gertO343  0.8         16581
## 253 viti24248  0.9         11950
## 254 viti24247  0.9         12891
## 255 viti30489  0.9         12570
## 256 viti30504  0.9         12967
## 257 viti30468  0.9         12920
## 258 viti30467  0.9         12988
## 259 viti30470  0.9         12981
## 260 viti30490  0.9         12965
## 261 mari26392  0.9         12918
## 262 mari26383  0.9         12252
## 263 mari26394  0.9         12936
## 264 mari26391  0.9         12986
## 265 mari26382  0.9         12968
## 266 mari26405  0.9         12426
## 267 mari26408  0.9         12484
## 268 mari26406  0.9         12974
## 269 mari26342  0.9         12234
## 270 mari26347  0.9         12905
## 271 mari26338  0.9         12969
## 272 mari26348  0.9         12848
## 273 mari26439  0.9         12893
## 274 exim24403  0.9         12279
## 275 exim25219  0.9         11655
## 276 exim25227  0.9         11744
## 277 exim25206  0.9         12973
## 278 peal89771  0.9         12942
## 279 juli21453  0.9         12600
## 280 juli21486  0.9         12634
## 281 juli21504  0.9         12958
## 282 sant21522  0.9         12719
## 283 sant21429  0.9         12925
## 284 sant21519  0.9         12882
## 285 sant45831  0.9         11864
## 286 sant21626  0.9         12942
## 287 orna19404  0.9         12714
## 288 solo12834  0.9         11768
## 289 solo35056  0.9         12682
## 290 solo15921  0.9         12944
## 291 solo35081  0.9         13001
## 292 solo35037  0.9         12995
## 293 amoe35602  0.9         12475
## 294 amoe35570  0.9         12958
## 295 amoe35585  0.9         12995
## 296 amoe35579  0.9         12993
## 297 amoe35565  0.9         13005
## 298 pele23651  0.9         12580
## 299 pele23662  0.9         11966
## 300 farq21480  0.9         12726
## 301 farq21509  0.9         12720
## 302 farq21584  0.9         12691
## 303 farq21586  0.9         12520
## 304 farq21587  0.9         12662
## 305 lpyg32019  0.9         10994
## 306 lpyg34505  0.9         11555
## 307  lpyg6654  0.9         10539
## 308  cin16774  0.9         12173
## 309  cin22024  0.9         12602
## 310 cinB25306  0.9         12656
## 311 rufi42805  0.9         12457
## 312 mauk42604  0.9         12547
## 313 mauk42603  0.9         12758
## 314 atiu42504  0.9         12691
## 315  gertO343  0.9         12763
## 316 viti24248  1.0          4102
## 317 viti24247  1.0          4102
## 318 viti30489  1.0          4102
## 319 viti30504  1.0          4102
## 320 viti30468  1.0          4102
## 321 viti30467  1.0          4102
## 322 viti30470  1.0          4102
## 323 viti30490  1.0          4102
## 324 mari26392  1.0          4102
## 325 mari26383  1.0          4102
## 326 mari26394  1.0          4102
## 327 mari26391  1.0          4102
## 328 mari26382  1.0          4102
## 329 mari26405  1.0          4102
## 330 mari26408  1.0          4102
## 331 mari26406  1.0          4102
## 332 mari26342  1.0          4102
## 333 mari26347  1.0          4102
## 334 mari26338  1.0          4102
## 335 mari26348  1.0          4102
## 336 mari26439  1.0          4102
## 337 exim24403  1.0          4102
## 338 exim25219  1.0          4102
## 339 exim25227  1.0          4102
## 340 exim25206  1.0          4102
## 341 peal89771  1.0          4102
## 342 juli21453  1.0          4102
## 343 juli21486  1.0          4102
## 344 juli21504  1.0          4102
## 345 sant21522  1.0          4102
## 346 sant21429  1.0          4102
## 347 sant21519  1.0          4102
## 348 sant45831  1.0          4102
## 349 sant21626  1.0          4102
## 350 orna19404  1.0          4102
## 351 solo12834  1.0          4102
## 352 solo35056  1.0          4102
## 353 solo15921  1.0          4102
## 354 solo35081  1.0          4102
## 355 solo35037  1.0          4102
## 356 amoe35602  1.0          4102
## 357 amoe35570  1.0          4102
## 358 amoe35585  1.0          4102
## 359 amoe35579  1.0          4102
## 360 amoe35565  1.0          4102
## 361 pele23651  1.0          4102
## 362 pele23662  1.0          4102
## 363 farq21480  1.0          4102
## 364 farq21509  1.0          4102
## 365 farq21584  1.0          4102
## 366 farq21586  1.0          4102
## 367 farq21587  1.0          4102
## 368 lpyg32019  1.0          4102
## 369 lpyg34505  1.0          4102
## 370  lpyg6654  1.0          4102
## 371  cin16774  1.0          4102
## 372  cin22024  1.0          4102
## 373 cinB25306  1.0          4102
## 374 rufi42805  1.0          4102
## 375 mauk42604  1.0          4102
## 376 mauk42603  1.0          4102
## 377 atiu42504  1.0          4102
## 378  gertO343  1.0          4102
#run function to drop samples above the threshold we want from the vcf
vcfR<-missing_by_sample(vcfR=vcfR, cutoff = .55)
## 0 samples are above a 0.55 missing data cutoff, and were removed from VCF

#subset popmap to only include retained individuals
popmap<-popmap[popmap$id %in% colnames(vcfR@gt),]

dealing with missing data

#remove invariant sites generated by dropping individuals
vcfR<-min_mac(vcfR, min.mac = 1)
## 13.73% of SNPs fell below a minor allele count of 1 and were removed from the VCF

#verify that missing data is not driving clustering patterns among the retained samples
miss<-assess_missing_data_pca(vcfR=vcfR, popmap = popmap, thresholds = .8, clustering = FALSE)
## cutoff is specified, filtered vcfR object will be returned
## 51.66% of SNPs fell below a completeness cutoff of 0.8 and were removed from the VCF
## Loading required namespace: adegenet

#Set arbitrary cutoff for missing data allowed per SNP. 
#We can visualize the effect that typical missing data cutoffs will have on both
# the number of SNPs retained and the total missing data in our entire dataset.We
#want to choose a cutoff that minimizes the overall missing data in the dataset, 
#while maximizing the total number of loci retained.

#visualize missing data by SNP and the effect of various cutoffs on the missingnessof each sample
missing_by_snp(vcfR)
## cutoff is not specified, exploratory visualizations will be generated
## Picking joint bandwidth of 0.0586

##    filt missingness snps.retained
## 1  0.30  0.24541314         32074
## 2  0.50  0.16625169         26240
## 3  0.60  0.12853173         23190
## 4  0.65  0.11264472         21818
## 5  0.70  0.08905816         19636
## 6  0.75  0.07461574         18188
## 7  0.80  0.05979870         16529
## 8  0.85  0.04599891         14745
## 9  0.90  0.03278357         12652
## 10 0.95  0.01701234          9418
## 11 1.00  0.00000000          4102
#verify that missing data is not driving clustering patterns among the retained samples at some reasonable thresholds
miss<-assess_missing_data_pca(vcfR=vcfR, popmap = popmap, thresholds = c(.7,.8,.85), clustering = FALSE)
## cutoff is specified, filtered vcfR object will be returned
## 42.57% of SNPs fell below a completeness cutoff of 0.7 and were removed from the VCF

## cutoff is specified, filtered vcfR object will be returned

## 51.66% of SNPs fell below a completeness cutoff of 0.8 and were removed from the VCF

## cutoff is specified, filtered vcfR object will be returned

## 56.88% of SNPs fell below a completeness cutoff of 0.85 and were removed from the VCF

miss<-assess_missing_data_pca(vcfR=vcfR, popmap = popmap, thresholds = c(.9,.95, 1.0), clustering = FALSE)
## cutoff is specified, filtered vcfR object will be returned
## 63% of SNPs fell below a completeness cutoff of 0.9 and were removed from the VCF

## cutoff is specified, filtered vcfR object will be returned

## 72.46% of SNPs fell below a completeness cutoff of 0.95 and were removed from the VCF

## cutoff is specified, filtered vcfR object will be returned

## 88% of SNPs fell below a completeness cutoff of 1 and were removed from the VCF

#choose a value that retains an acceptable amount of missing data in each sample, and maximizes SNPs retained while minimizing overall missing data, and filter vcf
vcfR<-missing_by_snp(vcfR, cutoff = .95)
## cutoff is specified, filtered vcfR object will be returned
## 72.46% of SNPs fell below a completeness cutoff of 0.95 and were removed from the VCF

#check how many SNPs and samples are left
vcfR
## ***** Object of Class vcfR *****
## 63 samples
## 90 CHROMs
## 9,418 variants
## Object size: 54.7 Mb
## 1.701 percent missing data
## *****        *****         *****
#

investigate the effect of a minor allele count (MAC) cutoff on downstream inferences.

#MAC/MAF cutoffs can be helpful in removing spurious and uninformative loci from
#the dataset, but also have the potential to bias downstream inferences. Linck 
#and Battey (2019) have an excellent paper on just this topic. 
  
#Our package contains a convenient wrapper functions that can filter based on 
#minor allele count (MAC) and streamline investigation of the effects of various
#filtering parameters on sample clustering patterns.

#investigate clustering patterns with and without a minor allele cutoff
#use min.mac() to investigate the effect of multiple cutoffs
vcfR.mac<-min_mac(vcfR = vcfR, min.mac = 2)
## 19.81% of SNPs fell below a minor allele count of 2 and were removed from the VCF

#assess clustering without MAC cutoff
miss<-assess_missing_data_tsne(vcfR, popmap, clustering = FALSE)

#assess clustering with MAC cutoff
miss<-assess_missing_data_tsne(vcfR.mac, popmap, clustering = FALSE)

#based on these visualizations, singletons are not obviously biasing clustering
# patterns, so I will leave them in for now. If I want to run a program like 
# STRUCTURE, where singletons are known to bias inference, I can write out the 
#vcf with singletons removed as well:
#vcfR::write.vcf(vcfR.mac, file = "output/sacer-no-singletons.146.vcf.gz")

Quality per SNP and sample

# Finally, we will make sure that the depth and genotype quality look consistent
# across SNPs and samples, following our filtering pipeline.
#plot depth per snp and per sample
dp <- extract.gt(vcfR, element = "DP", as.numeric=TRUE)
heatmap.bp(dp, rlabels = FALSE)

#plot genotype quality per snp and per sample
gq <- extract.gt(vcfR, element = "GQ", as.numeric=TRUE)
heatmap.bp(gq, rlabels = FALSE)

Write out vcf files for downstream analyses

# We can use the distance_thin() function from the SNPfiltR package
# in order to filter our SNPs to a minimum distance between SNPs, in order to 
# get a set of unlinked SNPs for downsteam analyses. 

#1. ALL SNPS
#write out vcf with all SNPs (uncomment when ready)
#vcfR::write.vcf(vcfR, "SNPfiltR-output/sacer.63.all_filtered.vcf.gz")
#vcfR

#2. UNLINKED SNPS
# write out a vcf file with thinned SNPs, here one per 500 bp
#vcfR.thin<-distance_thin(vcfR, min.distance = 500)
#vcfR.thin
#write out thinned vcf, uncomment when ready. 
#vcfR::write.vcf(vcfR.thin, "SNPfiltR-output/sacer.63.filtered.thinned.vcf.gz")

#3. ALL SNPS, NO SINGLETONS (for structure/smnf plots)
# Since singletons are known to bias inference with structure like lots, I will 
# produce a dataset with no singletons. 
vcfR.mac
## ***** Object of Class vcfR *****
## 63 samples
## 90 CHROMs
## 7,552 variants
## Object size: 44.4 Mb
## 1.709 percent missing data
## *****        *****         *****
#vcfR::write.vcf(vcfR.mac, file = "SNPfiltR-output/sacer.63.no-singletons.vcf.gz")

getting a white list for downstream phylogenetic analyses

# if i wanted to start here later, i could download my filtered and thinned 
# vcf file like below. Otherwise, i could just use the "vcfR.thin" instead. Up to you. 

#vcfR_final<-read.vcfR("SNPfiltR-output/sacer.63.filtered.thinned.vcf.gz")
#vcfR_final
#
##check out the metadata
#head(vcfR_final@fix)
#
##we want to isolate the third column, which contains the name of each locus
#head(vcfR_final@fix[,3])
#
##get a list of just locus names to use as whitelist for stacks
#whitelist<-sub(":.*", "", vcfR_final@fix[,3])
#
##make sure each locus is unique
#length(unique(whitelist)) == length(whitelist)
#
##make sure the locus names look right
#whitelist[1:5]
#View(whitelist)
#
##write out whitelist for stacks
#write.table(whitelist, file = "SNPfiltR-output/sacer63_filteredSNPs.whitelist.txt", quote = F, row.names = F, col.names = F)
#
##generate popmap including only the samples in this  vcf file 
#df<-data.frame(ind=colnames(vcfR_final@gt)[-1], pop=gsub("_.*","", x=gsub("T_","",x = colnames(vcfR_final@gt)[-1])))
#
##write out popmap for stacks
#
#write.table(df, file = "SNPfiltR-output/sacer63_filteredSNPs.popmap.txt", quote = F, row.names = F, col.names = F, sep = "\t")
#
# in the same way you ran populations originally (from step #2), run this code instead (with no #)
#populations -P $src/populations_out/sacer63 -M $src/popmaps/sacer63.txt -O $src/populations_out/pop_sacer63_50per --whitelist popmaps/sacer63_filteredSNPs.whitelist.txt --phylip-var-all -t 8